Data from here: hhttps://osf.io/nbepu Original methods and full paper here: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0265662
figures <- "figures/"
library(stringr)
library(dplyr)
library(DescTools)
library(car)
require(foreign)
require(ggplot2)
require(Hmisc)
require(reshape2)
library(tidyverse)
library(broom)
library(aod)
library(glmtoolbox)
library(fmsb)
library(uwot)
library(ggthemes)
library(ggforce)
library(readxl) #leaves characters as characters
library(ROCR) # Use ROCR package to plot ROC curve & AUC
library(pROC)
library(caret)
library(xgboost)
library(smotefamily) # one method of ML data augmentation
library(rsample)
library(mltools) # Wrapper allows UMAP calculations
library(data.table)
library(heatmaply)
library(rstatix) # to calculate Kendall correlation with associated p-values.
library(IHW) # Bioconductor package to calculate BH threshold
library(dbscan)
library(corrplot)
library(showtext)
knitr::opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)
library(splines)
library(cluster)
library(factoextra)
library(clusterSim)
library(RColorBrewer)
# Save the data so can run subsequent analysis ad lib
SAVE = TRUE
# quicker version readxl
ml <- read_excel("Canine health results.xlsx", sheet = "All")
cols <- c("Col_a")
for (col in 2:ncol(ml)) {
if (col <= 26) {
cols <- cbind(cols, paste0("Col_", letters[col]))
} else if (col <= 52) {
cols <- cbind(cols, paste0("Col_a", letters[col - 26]))
} else {
cols <- cbind(cols, paste0("Col_b", letters[col - 52]))
}
}
names(ml) <- cols
Rename variables
ml <- dplyr::rename(ml, unique.resp.number = Col_a)
ml <- dplyr::rename(ml, Location = Col_c)
ml <- dplyr::rename(ml, setting = Col_e)
ml <- dplyr::rename(ml, Education = Col_g)
ml <- dplyr::rename(ml, Animal_Career = Col_h)
ml <- dplyr::rename(ml, Income = Col_i)
ml <- dplyr::rename(ml, C_Age = Col_j)
ml <- dplyr::rename(ml, C_Gender = Col_k)
ml <- dplyr::rename(ml, C_Diet = Col_l)
ml <- dplyr::rename(ml, Size = Col_o)
ml <- dplyr::rename(ml, D_Age = Col_p)
ml <- dplyr::rename(ml, D_Gender = Col_r)
ml <- dplyr::rename(ml, Therapeutic_Food = Col_s)
ml <- dplyr::rename(ml, Visits = Col_t)
ml <- dplyr::rename(ml, Vet_Health = Col_u)
ml <- dplyr::rename(ml, Meds = Col_ag)
ml <- dplyr::rename(ml, Health = Col_ah)
ml <- dplyr::rename(ml, D_Diet = Col_ai)
ml <- dplyr::rename(ml, Decision_Maker = Col_at)
head(ml)
ml$D_Sex <- rep("Male", nrow(ml))
for (i in 1:nrow(ml)) {
if (grepl("Male", ml$D_Gender[i], fixed = TRUE)) {
ml$D_Sex[i] <- "Male"
} else {
ml$D_Sex[i] <- "Female"
}
}
# Checks and Balances
dplyr::count(ml, D_Sex)
ml$D_Neuter <- rep("BLANK", nrow(ml))
for (i in 1:nrow(ml)) {
if (grepl("intact", ml$D_Gender[i], fixed = TRUE)) {
ml$D_Neuter[i] <- "Intact"
} else {
ml$D_Neuter[i] <- "Neutered"
}
}
dplyr::count(ml, D_Neuter)
dplyr::count(ml, D_Age)
ml <- subset(ml, D_Age != "<1" & D_Age != "Unsure")
ml$D_Age <- as.numeric(ml$D_Age)
dplyr::count(ml, D_Age)
uml <- ml
# Temporarily create the Owner Diet data labels for use
# later.
# Numeric -Euclidean
D_Age <- uml$D_Age
C_Age <- factor(uml$C_Age, ordered = TRUE, levels = c("18-19",
"20–29", "30–39", "40–49", "50–59", "60–69", ">70"))
# Ordered FALSE since TOY is not strictly smaller than
# Small, but different.
Size <- factor(uml$Size, ordered = FALSE, levels = c("Toy (≤ 4.1 kg or ≤ 9 lb)",
"Small (4.5–10.9 kg or 10–24 lb)", "Medium (11.4–22.3 kg or 25–49 lb)",
"Large (22.7–40.5 kg or 50–89 lb)", "Giant (> 40.9 kg or > 90 lb)"))
uml$Education <- factor(uml$Education, ordered = TRUE, levels = c("Did not complete high school",
"High school or equivalent", "College or University award lower than undergraduate degree",
"University undergraduate degree", "University postgraduate degree lower than doctoral degree e.g. masters",
"Doctoral degree"))
C_DietLabs <- uml$C_Diet
D_DietLabs <- uml$D_Diet
Edu_Labs <- uml$Education
Education <- uml$Education
# Not truely ordered because some 'prefer not to say'
Income <- as.factor(uml$Income)
# Factors even if binary - use Hamming
D_Sex <- as.factor(uml$D_Sex)
C_Gender <- as.factor(uml$C_Gender)
Neuter_Status <- as.factor(uml$D_Neuter)
Decision_Maker <- as.factor(uml$Decision_Maker)
setting <- as.factor(uml$setting)
Animal_Career <- as.factor(uml$Animal_Career)
Location <- as.factor(uml$Location)
D_Diet <- as.factor(uml$D_Diet)
C_Diet <- as.factor(uml$C_Diet)
# Reforming so we can use the explicitly set data types...
# Numeric Euclidean Ordered Manhattan Factor Hamming Factor
# Hamming Factor Hamming
Udata <- data.frame(cbind(D_Age, C_Age, Education, Size, Income,
Neuter_Status, D_Sex, C_Gender, Animal_Career, C_Diet, D_Diet,
setting, Decision_Maker, Location), stringsAsFactors = TRUE) #Should be no strings as already factors
# Labels
# if using uwot
targets <- c(rep("euclidean", 1), rep("manhattan", 2), rep("hamming",
11))
Udata$C_DietLabs <- C_DietLabs
uKeeper <- complete.cases(Udata)
Udata <- Udata[uKeeper, ]
CLabs <- Udata$C_DietLabs
Edu_Labs <- Edu_Labs[uKeeper]
# For other applications:
write.csv(Udata, "Udata.csv")
# Dont include the labels themselves in the UMAP
Udata <- dplyr::select(Udata, -C_DietLabs)
# nice-ify labels
CLabs <- stringr::str_replace(CLabs, "Vegetarian \\(consuming plants, eggs and milk\\)",
"Vegetarian")
CLabs <- stringr::str_replace(CLabs, "Omnivore \\(i.e. eat meat\\)",
"Omnivore")
CLabs <- stringr::str_replace(CLabs, "Omnivore reducing animal product consumption",
"Omnivore \\(Reducing\\)")
CLabs <- stringr::str_replace(CLabs, "Vegan \\(consuming no animal products\\)",
"Vegan")
CLabs <- stringr::str_replace(CLabs, "Pescatarian \\(including fish but no other meats\\)",
"Pescatarian")
# Set up for doggie
Labs <- D_DietLabs[uKeeper] #remember we complete cased above!
# nice-ify labels
Labs <- stringr::str_replace(Labs, "Meat-based – conventional",
"Meat-based")
Labs <- stringr::str_replace(Labs, "Vegan \\(consuming no animal products\\)",
"Vegan")
Labs <- stringr::str_replace(Labs, "Meat-based – lab-grown",
"Lab-grown")
Labs <- stringr::str_replace(Labs, "Meat-based – raw", "Raw food")
Labs <- stringr::str_replace(Labs, "Vegetarian \\(including eggs or milk, but not meat\\)",
"Vegetarian")
#Perform UMAP run
# Use UWOT library rather than UMAP since has built in
# categorical support hyperparameters, largely defaults
n_epochs = 50000
n_components = 2
n_n = 20
learning_rate = 0.01
scale = "maxabs"
spread = 1
min_dist = 0.001
set_op_mix_ratio = 1
local_con = 1
bandwidth = 1
repulsion_strength = 1
neg = 5
n_trees = 50
target_weight = 0.5
init = "normlaplacian"
set.seed(666)
um <- uwot::umap(Udata, target_metric = targets, spread = spread,
n_neighbors = n_n, min_dist = min_dist, negative_sample_rate = neg,
n_epochs = n_epochs, local_connectivity = local_con, n_trees = n_trees,
scale = scale)
df = as.data.frame(cbind(um[, 1], um[, 2]))
names(df) <- c("x", "y")
ggplot(df, aes(x, y)) + geom_point(aes(fill = Labs), alpha = 0.8,
shape = 21, size = 2, stroke = 0.2) + labs(fill = "Dog Diet") +
xlab("UMAP X co-ordinates") + ylab("UMAP Y co-ordinates") +
# geom_segment(aes(y=bottom,yend=top,x=-Inf,xend=-Inf))+
# geom_segment(aes(y=-Inf,yend=-Inf,x=left,xend=right))+
# scale_y_continuous(breaks=seq(bottom,top,2))+
# scale_x_continuous(breaks=seq(left,r-ight,2))+
geom_rangeframe() + theme_tufte(base_family = "Gill Sans")
fname = paste0("umaps/", "i", i, "min_dist", min_dist, "_spread",
spread, "_nn", n_n, "_lcon", local_con, "_neg", neg, "_epochs",
n_epochs, "_trees", n_trees, "_", scale, ".png")
# Test Umap plot including hyperparameters ggsave(fname)
#Perform density clustering analysis ##And Dog clustering metrics
suppressWarnings(file.remove("dog_diet_clusters.csv"))
## [1] TRUE
set.seed(666)
db <- dbscan(um, eps = 0.7, minPts = 50) #eps=1 minPts=5 db <- dbscan(um, eps=0.8, minPts = 25) dbscan(um, eps=0.86, minPts = 46)
print(db)
## DBSCAN clustering for 2609 objects.
## Parameters: eps = 0.7, minPts = 50
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 6 cluster(s) and 0 noise points.
##
## 1 2 3 4 5 6
## 674 374 612 505 130 314
##
## Available fields: cluster, eps, minPts, dist, borderPoints
combi <- cbind(df, db$cluster, Labs)
names(combi) <- c("x", "y", "cluster", "Diet")
print(paste("total observations = ", nrow(combi)))
## [1] "total observations = 2609"
output <- dplyr::count(combi, Diet, name = "All Data")
print(output)
## Diet All Data
## 1 Fungi-based 1
## 2 Insect-based 6
## 3 Lab-grown 7
## 4 Meat-based 1368
## 5 Mixture 16
## 6 Raw food 830
## 7 Unsure 10
## 8 Vegan 336
## 9 Vegetarian 35
write.table(output, "dog_diet_clusters.csv", sep = ",", row.names = FALSE,
append = TRUE)
centeroids <- data.frame(mx = as.numeric(), my = as.numeric())
for (CLUSTER in seq(0, max(db$cluster))) {
print(paste("cluster", CLUSTER))
this_clust <- subset(combi, cluster == CLUSTER)
print(paste("total observations in cluster", CLUSTER, " = ",
nrow(this_clust)))
output <- dplyr::count(this_clust, Diet, name = paste("Cluster",
CLUSTER))
print(output)
write.table(output, "dog_diet_clusters.csv", sep = ",", row.names = FALSE,
append = TRUE)
if (CLUSTER > 0) {
mx <- mean(this_clust$x)
my <- mean(this_clust$y)
lc <- data.frame(mx = mx, my = my)
centeroids <- rbind(centeroids, lc)
print(paste("Cluster", CLUSTER, "centeroid is", round(mx,
2), round(my, 2)))
}
}
## [1] "cluster 0"
## [1] "total observations in cluster 0 = 0"
## [1] Diet Cluster 0
## <0 rows> (or 0-length row.names)
## [1] "cluster 1"
## [1] "total observations in cluster 1 = 674"
## Diet Cluster 1
## 1 Lab-grown 3
## 2 Meat-based 571
## 3 Raw food 100
## [1] "Cluster 1 centeroid is -1.19 -5.04"
## [1] "cluster 2"
## [1] "total observations in cluster 2 = 374"
## Diet Cluster 2
## 1 Mixture 9
## 2 Unsure 3
## 3 Vegan 331
## 4 Vegetarian 31
## [1] "Cluster 2 centeroid is 6.22 14.99"
## [1] "cluster 3"
## [1] "total observations in cluster 3 = 612"
## Diet Cluster 3
## 1 Fungi-based 1
## 2 Insect-based 5
## 3 Lab-grown 2
## 4 Meat-based 424
## 5 Mixture 4
## 6 Raw food 176
## [1] "Cluster 3 centeroid is -0.14 2.71"
## [1] "cluster 4"
## [1] "total observations in cluster 4 = 505"
## Diet Cluster 4
## 1 Insect-based 1
## 2 Lab-grown 2
## 3 Meat-based 277
## 4 Mixture 2
## 5 Raw food 221
## 6 Unsure 1
## 7 Vegetarian 1
## [1] "Cluster 4 centeroid is -4.79 -3.69"
## [1] "cluster 5"
## [1] "total observations in cluster 5 = 130"
## Diet Cluster 5
## 1 Meat-based 96
## 2 Raw food 33
## 3 Unsure 1
## [1] "Cluster 5 centeroid is 0.52 -1.5"
## [1] "cluster 6"
## [1] "total observations in cluster 6 = 314"
## Diet Cluster 6
## 1 Mixture 1
## 2 Raw food 300
## 3 Unsure 5
## 4 Vegan 5
## 5 Vegetarian 3
## [1] "Cluster 6 centeroid is 2.9 -5.75"
##Owner Diet clustering metrics
suppressWarnings(file.remove("owner_diet_clusters.csv"))
## [1] TRUE
combi <- cbind(df, db$cluster, CLabs)
names(combi) <- c("x", "y", "cluster", "C_Diet")
print(paste("total observations = ", nrow(combi)))
## [1] "total observations = 2609"
output <- dplyr::count(combi, C_Diet, name = "All Data")
print(output)
## C_Diet All Data
## 1 Omnivore 1049
## 2 Omnivore (Reducing) 562
## 3 Other 20
## 4 Pescatarian 133
## 5 Vegan 581
## 6 Vegetarian 264
write.table(output, "owner_diet_clusters.csv", sep = ",", row.names = FALSE,
append = TRUE)
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = FALSE, : appending column names to file
for (CLUSTER in seq(0, max(db$cluster))) {
print(paste("cluster", CLUSTER))
this_clust <- subset(combi, cluster == CLUSTER)
print(paste("total observations in cluster", CLUSTER, " = ",
nrow(this_clust)))
output <- dplyr::count(this_clust, C_Diet, name = paste("Cluster",
CLUSTER))
print(output)
write.table(output, "owner_diet_clusters.csv", sep = ",",
row.names = TRUE, append = TRUE)
}
## [1] "cluster 0"
## [1] "total observations in cluster 0 = 0"
## [1] C_Diet Cluster 0
## <0 rows> (or 0-length row.names)
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 1"
## [1] "total observations in cluster 1 = 674"
## C_Diet Cluster 1
## 1 Omnivore 434
## 2 Omnivore (Reducing) 235
## 3 Other 4
## 4 Pescatarian 1
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 2"
## [1] "total observations in cluster 2 = 374"
## C_Diet Cluster 2
## 1 Omnivore 2
## 2 Omnivore (Reducing) 9
## 3 Other 1
## 4 Pescatarian 9
## 5 Vegan 331
## 6 Vegetarian 22
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 3"
## [1] "total observations in cluster 3 = 612"
## C_Diet Cluster 3
## 1 Other 1
## 2 Pescatarian 119
## 3 Vegan 250
## 4 Vegetarian 242
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 4"
## [1] "total observations in cluster 4 = 505"
## C_Diet Cluster 4
## 1 Omnivore 333
## 2 Omnivore (Reducing) 166
## 3 Other 6
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 5"
## [1] "total observations in cluster 5 = 130"
## C_Diet Cluster 5
## 1 Omnivore 66
## 2 Omnivore (Reducing) 57
## 3 Other 5
## 4 Pescatarian 2
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 6"
## [1] "total observations in cluster 6 = 314"
## C_Diet Cluster 6
## 1 Omnivore 214
## 2 Omnivore (Reducing) 95
## 3 Other 3
## 4 Pescatarian 2
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
##Owner Education clustering metrics
suppressWarnings(file.remove("owner_edu_clusters.csv"))
## [1] TRUE
combi <- cbind(df, db$cluster, Edu_Labs)
names(combi) <- c("x", "y", "cluster", "Education")
print(paste("total observations = ", nrow(combi)))
## [1] "total observations = 2609"
output <- dplyr::count(combi, Education, name = "All Data")
print(output)
## Education
## 1 Did not complete high school
## 2 High school or equivalent
## 3 College or University award lower than undergraduate degree
## 4 University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6 Doctoral degree
## All Data
## 1 42
## 2 478
## 3 746
## 4 740
## 5 511
## 6 92
write.table(output, "owner_edu_clusters.csv", sep = ",", row.names = FALSE,
append = TRUE)
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## FALSE, : appending column names to file
for (CLUSTER in seq(0, max(db$cluster))) {
print(paste("cluster", CLUSTER))
this_clust <- subset(combi, cluster == CLUSTER)
print(paste("total observations in cluster", CLUSTER, " = ",
nrow(this_clust)))
output <- dplyr::count(this_clust, Education, name = paste("Cluster",
CLUSTER))
print(output)
write.table(output, "owner_edu_clusters.csv", sep = ",",
row.names = TRUE, append = TRUE)
}
## [1] "cluster 0"
## [1] "total observations in cluster 0 = 0"
## [1] Education Cluster 0
## <0 rows> (or 0-length row.names)
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 1"
## [1] "total observations in cluster 1 = 674"
## Education
## 1 Did not complete high school
## 2 High school or equivalent
## 3 College or University award lower than undergraduate degree
## 4 University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6 Doctoral degree
## Cluster 1
## 1 10
## 2 143
## 3 199
## 4 182
## 5 121
## 6 19
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 2"
## [1] "total observations in cluster 2 = 374"
## Education
## 1 Did not complete high school
## 2 High school or equivalent
## 3 College or University award lower than undergraduate degree
## 4 University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6 Doctoral degree
## Cluster 2
## 1 3
## 2 51
## 3 74
## 4 125
## 5 101
## 6 20
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 3"
## [1] "total observations in cluster 3 = 612"
## Education
## 1 Did not complete high school
## 2 High school or equivalent
## 3 College or University award lower than undergraduate degree
## 4 University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6 Doctoral degree
## Cluster 3
## 1 5
## 2 118
## 3 160
## 4 152
## 5 152
## 6 25
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 4"
## [1] "total observations in cluster 4 = 505"
## Education
## 1 Did not complete high school
## 2 High school or equivalent
## 3 College or University award lower than undergraduate degree
## 4 University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6 Doctoral degree
## Cluster 4
## 1 14
## 2 97
## 3 181
## 4 139
## 5 66
## 6 8
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 5"
## [1] "total observations in cluster 5 = 130"
## Education
## 1 Did not complete high school
## 2 High school or equivalent
## 3 College or University award lower than undergraduate degree
## 4 University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6 Doctoral degree
## Cluster 5
## 1 2
## 2 10
## 3 29
## 4 50
## 5 26
## 6 13
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 6"
## [1] "total observations in cluster 6 = 314"
## Education
## 1 Did not complete high school
## 2 High school or equivalent
## 3 College or University award lower than undergraduate degree
## 4 University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6 Doctoral degree
## Cluster 6
## 1 8
## 2 59
## 3 103
## 4 92
## 5 45
## 6 7
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
#Pull out the centeroids
p <- ggplot(df, aes(x, y)) + geom_point(aes(fill = Labs), alpha = 0.8,
shape = 21, size = 2, stroke = 0.2) + labs(fill = "Dog Diet") +
xlab("UMAP X co-ordinates") + ylab("UMAP Y co-ordinates") +
# geom_segment(aes(y=bottom,yend=top,x=-Inf,xend=-Inf))+
# geom_segment(aes(y=-Inf,yend=-Inf,x=left,xend=right))+
# scale_y_continuous(breaks=seq(bottom,top,2))+
# scale_x_continuous(breaks=seq(left,r-ight,2))+
geom_rangeframe() + theme_tufte(base_family = "Gill Sans") +
geom_mark_circle(data = centeroids, aes(x = mx, y = my, group = rownames(centeroids)),
fill = "white", alpha = 0.7, color = "black", con.type = "none",
label.fill = "white", show.legend = FALSE, inherit.aes = FALSE) +
geom_text(data = centeroids, aes(mx, my, label = rownames(centeroids)),
size = 10, color = "black")
fname = paste0("umaps/", "i", i, "min_dist", min_dist, "_spread",
spread, "_nn", n_n, "_lcon", local_con, "_neg", neg, "_epochs",
n_epochs, "_trees", n_trees, "_", scale, ".png")
print(p)
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# ggsave(fname)
#Calculate silhoutte and cluster metric
set.seed(1234)
sil <- silhouette(db$cluster, dist(dplyr::select(combi, x, y)))
sil <- fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 674 0.35
## 2 2 374 0.92
## 3 3 612 0.64
## 4 4 505 0.59
## 5 5 130 0.85
## 6 6 314 0.80
sil_df <- sil$data
output <- data.frame(Cluster = as.numeric(), Score = as.numeric(),
mx = as.numeric(), my = as.numeric())
mx <- 0
snx <- 0
for (cluster in unique(sil_df$cluster)) {
c <- as.character(cluster)
Score <- mean(subset(sil_df, cluster == c)$sil_width)
my <- Score
# halfway long the cluster
nx <- nrow(subset(sil_df, cluster == c))
mx <- snx + nx/2
snx <- snx + nx
print(paste(cluster, round(Score, 3), nx))
local <- data.frame(Cluster = as.numeric(as.character(cluster)),
Score = round(Score, 3), mx = mx, my = my)
output <- rbind(output, local)
}
## [1] "1 0.347 674"
## [1] "2 0.92 374"
## [1] "3 0.639 612"
## [1] "4 0.593 505"
## [1] "5 0.853 130"
## [1] "6 0.799 314"
head(output)
write.csv(output, "sils.csv")
output <- subset(output, Cluster > 0)
p <- sil + geom_mark_circle(data = output, aes(x = mx, y = my,
group = rownames(output)), fill = "white", alpha = 0.7, color = "black",
con.type = "none", label.fill = "white", show.legend = FALSE,
inherit.aes = FALSE) + geom_text(data = output, aes(mx, my,
label = round(Score, 2)), size = 3, color = "black")
print(p)
ggsave(paste0(figures, "fviz_silhouette.tiff"), width = 5, height = 4,
dpi = 300, compression = "lzw")
#Davies-Bouldin
result <- index.DB(dplyr::select(combi, x, y), combi$cluster,
centrotypes = "centroids")
# print the Davies-Bouldin index
print(paste("Davies-Bouldin index is ", round(result$DB, 4)))
## [1] "Davies-Bouldin index is 0.547"
Labs <- D_DietLabs[uKeeper] #remember we complete cased above!
# nice-ify labels
Labs <- stringr::str_replace(Labs, "Meat-based – conventional",
"Meat-based")
Labs <- stringr::str_replace(Labs, "Vegan \\(consuming no animal products\\)",
"Vegan")
Labs <- stringr::str_replace(Labs, "Meat-based – lab-grown",
"Lab-grown")
Labs <- stringr::str_replace(Labs, "Meat-based – raw", "Raw food")
Labs <- stringr::str_replace(Labs, "Vegetarian \\(including eggs or milk, but not meat\\)",
"Vegetarian")
top = 8
bottom = -6
left = -6
right = 10
ggplot(df, aes(x, y)) + geom_point(aes(fill = Labs), alpha = 0.8,
shape = 21, size = 3, stroke = 0.2) + labs(fill = "Dog Diet") +
xlab("UMAP X co-ordinates") + ylab("UMAP Y co-ordinates") +
geom_segment(aes(y = bottom, yend = top, x = -Inf, xend = -Inf)) +
geom_segment(aes(y = -Inf, yend = -Inf, x = left, xend = right)) +
scale_fill_manual(values = rainbow(9)) + scale_y_continuous(breaks = seq(bottom,
top, 2)) + scale_x_continuous(breaks = seq(left, right, 2)) +
geom_rangeframe() + theme_tufte(base_family = "Gill Sans",
base_size = 18) + theme(legend.title = element_text(size = 18),
legend.text = element_text(size = 12)) + geom_rangeframe() +
geom_mark_circle(data = centeroids, aes(x = mx, y = my, group = rownames(centeroids)),
fill = "white", alpha = 0.3, color = "black", con.type = "none",
label.fill = "white", show.legend = FALSE, inherit.aes = FALSE) +
geom_text(data = centeroids, aes(mx, my, label = rownames(centeroids)),
size = 8, color = "black")
ggsave(paste0(figures, "UMAP_Clear_Dog.TIFF"), compression = "lzw")
cropbraket <- function(mystring) {
newstring <- unlist(str_split(mystring, " \\("))[1]
}
plotum <- function(labels, lcat, df) {
# number of legend rows
lrow = 2
if (length(unique(labels)) > 3) {
lrow = 3
}
if (length(unique(labels)) > 6) {
lrow = 4
}
ldf <- cbind(df, labels)
ldf <- ldf[complete.cases(ldf), ]
ldf <- ldf[order(ldf[, "labels"]), ]
labels <- ldf$labels
# Pretty the legend key
labels <- str_replace(labels, "Prefer not to answer", "Not given")
if (lcat == "D_Age")
lcat = "Dog Age"
if (lcat == "C_Age")
lcat = "Owner Age"
if (lcat == "Neuter_Status")
lcat = "Neuter Status"
if (lcat == "D_Sex")
lcat = "Dog Sex"
if (lcat == "C_Gender")
lcat = "Owner Gender"
if (lcat == "C_Diet") {
lcat = "Owner Diet"
# nice-ify labels
labels <- stringr::str_replace(labels, "Vegetarian \\(consuming plants, eggs and milk\\)",
"Vegetarian")
labels <- stringr::str_replace(labels, "Omnivore \\(i.e. eat meat\\)",
"Omnivore")
labels <- stringr::str_replace(labels, "Omnivore reducing animal product consumption",
"Reduced Meat")
labels <- stringr::str_replace(labels, "Vegan \\(consuming no animal products\\)",
"Vegan")
labels <- stringr::str_replace(labels, "Pescatarian \\(including fish but no other meats\\)",
"Pescatarian")
order <- as.numeric(labels == "Vegan")
ldf$order <- order
ldf <- ldf[order(ldf[, "order"]), ]
# Try and get the colours to match the Dog equiv
tempcols <- (scales::hue_pal())(n = 9)
colours <- tempcols[4:9]
}
if (lcat == "D_Diet")
return(NULL)
if (lcat == "Animal_Career") {
lcat = "Animal Career"
labels <- str_replace(labels, "None of the above", "None")
labels <- str_replace(labels, "Pet industry worker",
"Pet industry")
labels <- str_replace(labels, "Animal trainer", "Trainer")
labels <- str_replace(labels, "Veterinary technician or nurse",
"Vet Tech/Nurse")
labels <- str_replace(labels, "Animal breeder", "Breeder")
}
if (lcat == "Size")
labels <- sapply(labels, cropbraket)
if (lcat == "Decision_Maker") {
lcat = "Decision Maker"
labels <- str_replace(labels, "Primary decision-maker",
"Primary")
labels <- str_replace(labels, "Play some lesser role",
"Some Role")
labels <- str_replace(labels, "Play no role", "No Role")
}
if (lcat == "Location") {
labels <- str_replace(labels, "North America", "N. America")
labels <- str_replace(labels, "Other European", "Other Euro.")
labels <- str_replace(labels, "Australia/New Zealand/Oceania",
"Oceania")
}
if (lcat == "setting") {
lcat = "Setting"
labels <- str_replace(labels, "Equally urban and rural",
"Equal U & R")
}
if (lcat == "Education") {
labels <- str_replace(labels, "Did not complete high school",
"0_Basic")
labels <- str_replace(labels, "High school or equivalent",
"1_HighSchool")
labels <- str_replace(labels, "College or University award lower than undergraduate degree",
"2_College")
labels <- str_replace(labels, "University undergraduate degree",
"3_Grad")
labels <- str_replace(labels, "University postgraduate degree lower than doctoral degree e.g. masters",
"4_PG")
labels <- str_replace(labels, "Doctoral degree", "5_PhD")
}
labels <- as.factor(str_trunc(labels, 12, "right", ""))
if (lcat == "Dog Age")
labels <- as.numeric(labels)
# ldf <- dplyr::select(ldf,-labels)
# Put the labels back in!
ldf$labels <- labels
# remove this version to remove confusion
rm(labels)
top = 8
bottom = -6
left = -6
right = 10
p <- ggplot()
if (lcat == "Dog Diet") {
p <- p + geom_point(data = subset(ldf, !(labels %in%
c("Vegan"))), aes(x = x, y = y, fill = labels), alpha = 0.8,
shape = 21, size = 3, stroke = 0.2) + geom_point(data = subset(ldf,
labels == "Vegan"), aes(x = x, y = y, fill = labels),
alpha = 0.8, shape = 21, size = 3, stroke = 0.2)
} else if (lcat == "Decision Maker") {
p <- p + geom_point(data = subset(ldf, !(labels %in%
c("No Role", "Some Role"))), aes(x = x, y = y, fill = labels),
alpha = 0.8, shape = 21, size = 3, stroke = 0.2) +
geom_point(data = subset(ldf, labels == "No Role"),
aes(x = x, y = y, fill = labels), alpha = 0.8,
shape = 21, size = 3, stroke = 0.2) + geom_point(data = subset(ldf,
labels == "Some Role"), aes(x = x, y = y, fill = labels),
alpha = 0.8, shape = 21, size = 3, stroke = 0.2)
} else {
p <- p + geom_point(data = ldf, aes(x = x, y = y, fill = labels),
alpha = 0.8, shape = 21, size = 3, stroke = 0.2)
}
p <- p + labs(fill = lcat) + xlab("UMAP X co-ordinates") +
ylab("UMAP Y co-ordinates")
if (lcat == "Dog Age") {
p <- p + scale_fill_gradientn(colors = rainbow(8))
} else if (lcat == "Owner Diet") {
p <- p + scale_fill_manual(values = colours, limits = unique(ldf$labels))
}
p <- p + geom_segment(aes(y = bottom, yend = top, x = -Inf,
xend = -Inf)) + geom_segment(aes(y = -Inf, yend = -Inf,
x = left, xend = right)) + scale_y_continuous(breaks = seq(bottom,
top, 2)) + scale_x_continuous(breaks = seq(left, right,
2)) + geom_rangeframe() + theme_tufte(base_family = "Gill Sans",
base_size = 18) + theme(legend.title = element_text(size = 18),
legend.text = element_text(size = 12)) + geom_mark_circle(data = centeroids,
aes(x = mx, y = my, group = rownames(centeroids)), fill = "white",
alpha = 0.3, color = "black", con.type = "none", label.fill = "white",
show.legend = FALSE, inherit.aes = FALSE) + geom_text(data = centeroids,
aes(mx, my, label = rownames(centeroids)), size = 8,
color = "black")
print(p)
ggsave(paste0(figures, "UMAP_", lcat, ".TIFF"), compression = "lzw")
return(ldf)
}
for (label in names(Udata)) {
print(paste("starting", label))
if (label == "Neuter_Status") {
plotum(uml[["D_Neuter"]][uKeeper], label, df)
} else {
DF <- plotum(uml[[label]][uKeeper], label, df)
}
print(paste("completed", label))
}
## [1] "starting D_Age"
## [1] "completed D_Age"
## [1] "starting C_Age"
## [1] "completed C_Age"
## [1] "starting Education"
## [1] "completed Education"
## [1] "starting Size"
## [1] "completed Size"
## [1] "starting Income"
## [1] "completed Income"
## [1] "starting Neuter_Status"
## [1] "completed Neuter_Status"
## [1] "starting D_Sex"
## [1] "completed D_Sex"
## [1] "starting C_Gender"
## [1] "completed C_Gender"
## [1] "starting Animal_Career"
## [1] "completed Animal_Career"
## [1] "starting C_Diet"
## [1] "completed C_Diet"
## [1] "starting D_Diet"
## [1] "completed D_Diet"
## [1] "starting setting"
## [1] "completed setting"
## [1] "starting Decision_Maker"
## [1] "completed Decision_Maker"
## [1] "starting Location"
## [1] "completed Location"
ml <- dplyr::select(ml, c(unique.resp.number, Location, setting,
Education, Animal_Career, Income, C_Age, C_Gender, C_Diet,
Size, D_Age, D_Gender, D_Sex, D_Neuter, Therapeutic_Food,
Visits, Vet_Health, Meds, Health, D_Diet, Decision_Maker))
# check columns
head(ml)
ml <- subset(ml, D_Age != "NA")
ml$D_Age_quant <- cut(ml$D_Age, breaks = quantile(ml$D_Age, probs = seq(0,
1, 0.2)), na.rm = TRUE, include.lowest = TRUE)
dplyr::count(ml, D_Age_quant)
ml$D_Sex <- as.factor(ml$D_Sex)
# Checks and Balances
dplyr::count(ml, D_Sex)
ml$D_Neuter <- as.factor(ml$D_Neuter)
# Checks and Balances
dplyr::count(ml, D_Neuter)
dplyr::count(ml, Location)
for (i in 1:nrow(ml)) {
if (grepl("UK", ml$Location[i], fixed = TRUE)) {
ml$Location[i] <- "UK"
} else if (grepl("Other European", ml$Location[i], fixed = TRUE)) {
ml$Location[i] <- "Other European"
} else if (grepl("North America", ml$Location[i], fixed = TRUE)) {
ml$Location[i] <- "North America"
} else if (grepl("Australia/New Zealand/Oceania", ml$Location[i],
fixed = TRUE)) {
ml$Location[i] <- "Australia/New Zealand/Oceania"
} else {
ml$Location[i] <- "Other"
}
}
ml$Location <- factor(ml$Location, levels = c("UK", "Other European",
"North America", "Australia/New Zealand/Oceania", "Other"))
dplyr::count(ml, Location)
# Create a binary Loaction_Europe variable
ml$Location_Europe <- ml$Location
ml$Location_Europe <- as.character(ml$Location_Europe)
for (i in 1:nrow(ml)) {
if (grepl("Other European", ml$Location_Europe[i], fixed = TRUE)) {
ml$Location_Europe[i] <- "Yes"
} else {
ml$Location_Europe[i] <- "No"
}
}
dplyr::count(ml, Location_Europe)
dplyr::count(ml, setting)
ml <- subset(ml, setting != "Other" | setting != NA)
ml$setting <- as.factor(ml$setting)
dplyr::count(ml, setting)
# reorder categories so Urban is REF
ml$setting <- factor(ml$setting, levels = c("Urban", "Rural",
"Equally urban and rural"))
dplyr::count(ml, setting)
# Create a binary URBAN variable
ml$Urban <- ml$setting
ml$Urban <- as.character(ml$Urban)
for (i in 1:nrow(ml)) {
if (grepl("Urban", ml$Urban[i], fixed = TRUE)) {
ml$Urban[i] <- "Yes"
} else {
ml$Urban[i] <- "No"
}
}
dplyr::count(ml, Urban)
dplyr::count(ml, Education)
Qualifications = c("0_Basic", "1_HighSchool", "2_College", "3_Grad",
"4_PG", "5_PhD")
ml$Education <- str_replace(ml$Education, "Did not complete high school",
"0_Basic")
ml$Education <- str_replace(ml$Education, "High school or equivalent",
"1_HighSchool")
ml$Education <- str_replace(ml$Education, "College or University award lower than undergraduate degree",
"2_College")
ml$Education <- str_replace(ml$Education, "University undergraduate degree",
"3_Grad")
ml$Education <- str_replace(ml$Education, "University postgraduate degree lower than doctoral degree e.g. masters",
"4_PG")
ml$Education <- str_replace(ml$Education, "Doctoral degree",
"5_PhD")
ml$Education <- ordered(ml$Education)
dplyr::count(ml, Education)
#Simplify education further as small group sizes
Qualifications_S = c("0_Basic_or_HighSchool", "1_College", "2_Grad",
"3_PG_or_PhD")
ml$Education_S <- ml$Education
ml$Education_S <- str_replace(ml$Education_S, "0_Basic", "0_Basic_or_HighSchool")
ml$Education_S <- str_replace(ml$Education_S, "1_HighSchool",
"0_Basic_or_HighSchool")
ml$Education_S <- str_replace(ml$Education_S, "2_College", "1_College")
ml$Education_S <- str_replace(ml$Education_S, "3_Grad", "2_Grad")
ml$Education_S <- str_replace(ml$Education_S, "4_PG", "3_PG_or_PhD")
ml$Education_S <- str_replace(ml$Education_S, "5_PhD", "3_PG_or_PhD")
dplyr::count(ml, Education_S)
ml$Education_S2 <- factor(ml$Education_S, ordered = FALSE)
dplyr::count(ml, Education_S2)
dplyr::count(ml, Animal_Career)
ml$Animal_Career2 <- ml$Animal_Career
for (i in 1:nrow(ml)) {
if (grepl("Vet", ml$Animal_Career2[i], fixed = TRUE)) {
ml$Animal_Career2[i] <- "Vet professional"
} else if (grepl("Animal breeder", ml$Animal_Career2[i], fixed = TRUE)) {
ml$Animal_Career2[i] <- "breeder/trainer"
} else if (grepl("Animal trainer", ml$Animal_Career2[i], fixed = TRUE)) {
ml$Animal_Career2[i] <- "breeder/trainer"
} else if (grepl("Pet industry worker", ml$Animal_Career2[i],
fixed = TRUE)) {
ml$Animal_Career2[i] <- "Pet industry worker"
} else {
ml$Animal_Career2[i] <- "None of the above"
}
}
# reorder categories so None of the above is REF
ml$Animal_Career2 <- factor(ml$Animal_Career2, levels = c("None of the above",
"Vet professional", "breeder/trainer", "Pet industry worker"))
dplyr::count(ml, Animal_Career2)
ml$Animal_Career_BINARY <- ml$Animal_Career
for (i in 1:nrow(ml)) {
if (grepl("None of the above", ml$Animal_Career_BINARY[i],
fixed = TRUE)) {
ml$Animal_Career_BINARY[i] <- "No"
} else {
ml$Animal_Career_BINARY[i] <- "Yes"
}
}
dplyr::count(ml, Animal_Career_BINARY)
dplyr::count(ml, Income)
ml <- subset(ml, Income != "Prefer not to answer" | Income !=
NA)
ml$Income <- ordered(ml$Income, levels = c("Low", "Medium", "High"))
dplyr::count(ml, Income)
ml$Income2 <- factor(ml$Income, ordered = FALSE)
dplyr::count(ml, Income2)
dplyr::count(ml, C_Age)
ml$C_Age <- str_replace(ml$C_Age, "18-19", "<30")
ml$C_Age <- str_replace(ml$C_Age, "20–29", "<30")
ml$C_Age <- str_replace(ml$C_Age, ">70", "60<")
ml$C_Age <- str_replace(ml$C_Age, "60–69", "60<")
ml$C_Age <- ordered(ml$C_Age)
# Check right order
ml$C_Age[nrow(ml)]
## [1] 60<
## Levels: <30 < 30–39 < 40–49 < 50–59 < 60<
dplyr::count(ml, C_Age)
ml$C_Age2 <- factor(ml$C_Age, ordered = FALSE)
dplyr::count(ml, C_Age2)
dplyr::count(ml, C_Gender)
ml <- subset(ml, C_Gender == "Male" | C_Gender == "Female")
ml$C_Gender <- as.factor(ml$C_Gender)
dplyr::count(ml, C_Gender)
dplyr::count(ml, C_Diet)
ml <- subset(ml, C_Diet != "Other" | C_Diet != NA)
dplyr::count(ml, C_Diet)
ml$C_Diet_Vegan <- ml$C_Diet
for (i in 1:nrow(ml)) {
if (grepl("Vegan", ml$C_Diet_Vegan[i], fixed = TRUE)) {
ml$C_Diet_Vegan[i] <- "Yes"
} else {
ml$C_Diet_Vegan[i] <- "No"
}
}
dplyr::count(ml, C_Diet_Vegan)
ml$C_Diet_Vegan_Veggie <- ml$C_Diet
for (i in 1:nrow(ml)) {
if (grepl("Veg", ml$C_Diet_Vegan_Veggie[i], fixed = TRUE)) {
ml$C_Diet_Vegan_Veggie[i] <- "Yes"
} else {
ml$C_Diet_Vegan_Veggie[i] <- "No"
}
}
dplyr::count(ml, C_Diet_Vegan_Veggie)
for (i in 1:nrow(ml)) {
if (grepl("Toy", ml$Size[i], fixed = TRUE)) {
ml$Size[i] <- "Toy"
} else if (grepl("Small", ml$Size[i], fixed = TRUE)) {
ml$Size[i] <- "Small"
} else if (grepl("Medium", ml$Size[i], fixed = TRUE)) {
ml$Size[i] <- "Medium"
} else if (grepl("Large", ml$Size[i], fixed = TRUE)) {
ml$Size[i] <- "Large"
} else {
ml$Size[i] <- "Giant"
}
}
ml$Size <- ordered(ml$Size, levels = c("Toy", "Small", "Medium",
"Large", "Giant"))
dplyr::count(ml, Size)
ml$Size2 <- factor(ml$Size, levels = c("Medium", "Toy", "Small",
"Large", "Giant"), ordered = FALSE)
dplyr::count(ml, Size2)
Giant = c("Yes", "No")
ml$Size_Giant <- ml$Size2
ml$Size_Giant <- str_replace(ml$Size_Giant, "Giant", "Yes")
ml$Size_Giant <- str_replace(ml$Size_Giant, "Toy", "No")
ml$Size_Giant <- str_replace(ml$Size_Giant, "Medium", "No")
ml$Size_Giant <- str_replace(ml$Size_Giant, "Small", "No")
ml$Size_Giant <- str_replace(ml$Size_Giant, "Large", "No")
dplyr::count(ml, Size_Giant)
ml$Therapeutic_Food <- as.character(ml$Therapeutic_Food)
ml <- subset(ml, Therapeutic_Food != "NA" | Therapeutic_Food !=
NA)
dplyr::count(ml, Therapeutic_Food)
ml <- subset(ml, Visits != "NA")
dplyr::count(ml, Visits)
ml <- subset(ml, Visits != "Unsure" | Visits != NA)
ml <- subset(ml, Visits != "NA" | Visits != NA)
dplyr::count(ml, Visits)
ml$Visits <- str_replace(ml$Visits, ">3", "3<")
ml$Visits <- str_replace(ml$Visits, "0", "0")
ml$Visits <- str_replace(ml$Visits, "1", "1")
ml$Visits <- str_replace(ml$Visits, "2", "2")
ml$Visits <- str_replace(ml$Visits, "3", "3")
dplyr::count(ml, ml$Visits)
ml$Visits2 <- ml$Visits
ml$Visits2 <- str_replace(ml$Visits2, "3<", "3")
ml$Visits2 <- str_replace(ml$Visits2, "3", "3v")
ml$Visits2 <- str_replace(ml$Visits2, "0", "0v")
ml$Visits2 <- str_replace(ml$Visits2, "1", "1v")
ml$Visits2 <- str_replace(ml$Visits2, "2", "2v")
dplyr::count(ml, ml$Visits2)
ml$Meds <- as.factor(ml$Meds)
dplyr::count(ml, Meds)
dplyr::count(ml, ml$Health)
ml <- subset(ml, Health != "Unsure" | Health != NA)
ml$Health <- as.factor(ml$Health)
ml$Health <- ordered(ml$Health, levels = c("Healthy", "Generally healthy with minor or infrequent problems",
"Significant or frequent problems", "Seriously ill"))
dplyr::count(ml, Health)
Health_severity = c("0_None", "1_Mild", "2_Significant")
ml$Health2 <- ml$Health
ml$Health2 <- str_replace(ml$Health2, "Healthy", "0_None")
ml$Health2 <- str_replace(ml$Health2, "Generally healthy with minor or infrequent problems",
"1_Mild")
ml$Health2 <- str_replace(ml$Health2, "Significant or frequent problems",
"2_Significant")
ml$Health2 <- str_replace(ml$Health2, "Seriously ill", "2_Significant")
ml$Health2 <- ordered(ml$Health2, levels = c("0_None", "1_Mild",
"2_Significant"))
dplyr::count(ml, ml$Health2)
bhealth <- rep(0, nrow(ml))
for (i in 1:nrow(ml)) {
if (grepl("healthy", ml$Health[i], ignore.case = TRUE)) {
# 0 is good health
bhealth[i] <- 0
} else {
bhealth[i] <- 1
}
}
# Keep as a variable not factor!
ml$Health_Binary <- as.numeric(bhealth)
rm(bhealth)
# Checks and Balances, hey it's VERY unbalanced
dplyr::count(ml, Health_Binary)
bhealth2 <- rep(0, nrow(ml))
for (i in 1:nrow(ml)) {
if (grepl("0_None", ml$Health2[i], ignore.case = TRUE)) {
# 0 is good health
bhealth2[i] <- 0
} else {
bhealth2[i] <- 1
}
}
# Keep as a variable not factor!
ml$Any_Health_Problem <- as.numeric(bhealth2)
rm(bhealth2)
# Checks and Balances, better balanced this time.
dplyr::count(ml, Any_Health_Problem)
dplyr::count(ml, D_Diet)
ml <- subset(ml, D_Diet != "Unsure" | D_Diet != NA)
dplyr::count(ml, D_Diet)
ml <- subset(ml, D_Diet != "Mixture" | D_Diet != NA)
ml <- subset(ml, D_Diet != "Insect-based" | D_Diet != NA)
ml <- subset(ml, D_Diet != "Meat-based – lab-grown" | D_Diet !=
NA)
dplyr::count(ml, D_Diet)
ml$D_Diet_Vegan <- ml$D_Diet
for (i in 1:nrow(ml)) {
if (grepl("Vegan", ml$D_Diet_Vegan[i], fixed = TRUE)) {
ml$D_Diet_Vegan[i] <- "Yes"
} else {
ml$D_Diet_Vegan[i] <- "No"
}
}
dplyr::count(ml, D_Diet_Vegan)
ml$D_Diet_Raw <- ml$D_Diet
for (i in 1:nrow(ml)) {
if (grepl("raw", ml$D_Diet_Raw[i], fixed = TRUE)) {
ml$D_Diet_Raw[i] <- "Yes"
} else {
ml$D_Diet_Raw[i] <- "No"
}
}
dplyr::count(ml, D_Diet_Raw)
ml$D_Diet_Vegan_Veggie <- ml$D_Diet
for (i in 1:nrow(ml)) {
if (grepl("Veg", ml$D_Diet_Vegan_Veggie[i], fixed = TRUE)) {
ml$D_Diet_Vegan_Veggie[i] <- "Yes"
} else {
ml$D_Diet_Vegan_Veggie[i] <- "No"
}
}
dplyr::count(ml, D_Diet_Vegan_Veggie)
dplyr::count(ml, Decision_Maker)
ml <- subset(ml, Decision_Maker == "Primary decision-maker")
dplyr::count(ml, Decision_Maker)
ml <- subset(ml, select = c("Location", "setting", "Urban", "Education",
"Education_S", "Education_S2", "Animal_Career", "Animal_Career2",
"Animal_Career_BINARY", "Income", "Income2", "C_Age", "C_Age2",
"C_Gender", "C_Diet", "C_Diet_Vegan", "C_Diet_Vegan_Veggie",
"D_Age", "D_Age_quant", "Size", "Size2", "Size_Giant", "D_Sex",
"D_Neuter", "Therapeutic_Food", "Meds", "D_Diet", "D_Diet_Vegan",
"D_Diet_Vegan_Veggie", "D_Diet_Raw", "Visits", "Visits2",
"Health", "Health2", "Health_Binary", "Any_Health_Problem",
"Location_Europe"))
# check columns
head(ml)
describe(ml)
## ml
##
## 37 Variables 2211 Observations
## --------------------------------------------------------------------------------
## Location
## n missing distinct
## 2211 0 5
##
## UK (1609, 0.728), Other European (318, 0.144), North America (119, 0.054),
## Australia/New Zealand/Oceania (97, 0.044), Other (68, 0.031)
## --------------------------------------------------------------------------------
## setting
## n missing distinct
## 2211 0 3
##
## Value Urban Rural
## Frequency 780 728
## Proportion 0.353 0.329
##
## Value Equally urban and rural
## Frequency 703
## Proportion 0.318
## --------------------------------------------------------------------------------
## Urban
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1431 780
## Proportion 0.647 0.353
## --------------------------------------------------------------------------------
## Education
## n missing distinct
## 2211 0 6
##
## Value 0_Basic 1_HighSchool 2_College 3_Grad 4_PG
## Frequency 29 385 636 645 442
## Proportion 0.013 0.174 0.288 0.292 0.200
##
## Value 5_PhD
## Frequency 74
## Proportion 0.033
## --------------------------------------------------------------------------------
## Education_S
## n missing distinct
## 2211 0 4
##
## Value 0_Basic_or_HighSchool 1_College 2_Grad
## Frequency 414 636 645
## Proportion 0.187 0.288 0.292
##
## Value 3_PG_or_PhD
## Frequency 516
## Proportion 0.233
## --------------------------------------------------------------------------------
## Education_S2
## n missing distinct
## 2211 0 4
##
## Value 0_Basic_or_HighSchool 1_College 2_Grad
## Frequency 414 636 645
## Proportion 0.187 0.288 0.292
##
## Value 3_PG_or_PhD
## Frequency 516
## Proportion 0.233
## --------------------------------------------------------------------------------
## Animal_Career
## n missing distinct
## 2211 0 6
##
## Animal breeder (15, 0.007), Animal trainer (122, 0.055), None of the above
## (1786, 0.808), Pet industry worker (181, 0.082), Veterinarian (57, 0.026),
## Veterinary technician or nurse (50, 0.023)
## --------------------------------------------------------------------------------
## Animal_Career2
## n missing distinct
## 2211 0 4
##
## Value None of the above Vet professional breeder/trainer
## Frequency 1786 107 137
## Proportion 0.808 0.048 0.062
##
## Value Pet industry worker
## Frequency 181
## Proportion 0.082
## --------------------------------------------------------------------------------
## Animal_Career_BINARY
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1786 425
## Proportion 0.808 0.192
## --------------------------------------------------------------------------------
## Income
## n missing distinct
## 2211 0 3
##
## Value Low Medium High
## Frequency 352 1530 329
## Proportion 0.159 0.692 0.149
## --------------------------------------------------------------------------------
## Income2
## n missing distinct
## 2211 0 3
##
## Value Low Medium High
## Frequency 352 1530 329
## Proportion 0.159 0.692 0.149
## --------------------------------------------------------------------------------
## C_Age
## n missing distinct
## 2211 0 5
##
## Value <30 30–39 40–49 50–59 60<
## Frequency 340 482 452 515 422
## Proportion 0.154 0.218 0.204 0.233 0.191
## --------------------------------------------------------------------------------
## C_Age2
## n missing distinct
## 2211 0 5
##
## Value <30 30–39 40–49 50–59 60<
## Frequency 340 482 452 515 422
## Proportion 0.154 0.218 0.204 0.233 0.191
## --------------------------------------------------------------------------------
## C_Gender
## n missing distinct
## 2211 0 2
##
## Value Female Male
## Frequency 2055 156
## Proportion 0.929 0.071
## --------------------------------------------------------------------------------
## C_Diet
## n missing distinct
## 2211 0 5
##
## lowest : Omnivore (i.e. eat meat) Omnivore reducing animal product consumption Pescatarian (including fish but no other meats) Vegan (consuming no animal products) Vegetarian (consuming plants, eggs and milk)
## highest: Omnivore (i.e. eat meat) Omnivore reducing animal product consumption Pescatarian (including fish but no other meats) Vegan (consuming no animal products) Vegetarian (consuming plants, eggs and milk)
## --------------------------------------------------------------------------------
## C_Diet_Vegan
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1718 493
## Proportion 0.777 0.223
## --------------------------------------------------------------------------------
## C_Diet_Vegan_Veggie
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1497 714
## Proportion 0.677 0.323
## --------------------------------------------------------------------------------
## D_Age
## n missing distinct Info Mean Gmd .05 .10
## 2211 0 19 0.993 6.177 4.103 1 2
## .25 .50 .75 .90 .95
## 3 6 9 11 13
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 143 243 245 229 229 195 184 171 137 127 97
## Proportion 0.065 0.110 0.111 0.104 0.104 0.088 0.083 0.077 0.062 0.057 0.044
##
## Value 12 13 14 15 16 17 18 20
## Frequency 67 54 48 26 7 7 1 1
## Proportion 0.030 0.024 0.022 0.012 0.003 0.003 0.000 0.000
##
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## D_Age_quant
## n missing distinct
## 2211 0 5
##
## Value [1,3] (3,5] (5,7] (7,9.4] (9.4,20]
## Frequency 631 458 379 308 435
## Proportion 0.285 0.207 0.171 0.139 0.197
## --------------------------------------------------------------------------------
## Size
## n missing distinct
## 2211 0 5
##
## Value Toy Small Medium Large Giant
## Frequency 52 439 859 764 97
## Proportion 0.024 0.199 0.389 0.346 0.044
## --------------------------------------------------------------------------------
## Size2
## n missing distinct
## 2211 0 5
##
## Value Medium Toy Small Large Giant
## Frequency 859 52 439 764 97
## Proportion 0.389 0.024 0.199 0.346 0.044
## --------------------------------------------------------------------------------
## Size_Giant
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 2114 97
## Proportion 0.956 0.044
## --------------------------------------------------------------------------------
## D_Sex
## n missing distinct
## 2211 0 2
##
## Value Female Male
## Frequency 1042 1169
## Proportion 0.471 0.529
## --------------------------------------------------------------------------------
## D_Neuter
## n missing distinct
## 2211 0 2
##
## Value Intact Neutered
## Frequency 486 1725
## Proportion 0.22 0.78
## --------------------------------------------------------------------------------
## Therapeutic_Food
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 2111 100
## Proportion 0.955 0.045
## --------------------------------------------------------------------------------
## Meds
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1331 880
## Proportion 0.602 0.398
## --------------------------------------------------------------------------------
## D_Diet
## n missing distinct
## 2211 0 4
##
## Meat-based – conventional (1152, 0.521), Meat-based – raw (732, 0.331), Vegan
## (consuming no animal products) (295, 0.133), Vegetarian (including eggs or
## milk, but not meat) (32, 0.014)
## --------------------------------------------------------------------------------
## D_Diet_Vegan
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1916 295
## Proportion 0.867 0.133
## --------------------------------------------------------------------------------
## D_Diet_Vegan_Veggie
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1884 327
## Proportion 0.852 0.148
## --------------------------------------------------------------------------------
## D_Diet_Raw
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1479 732
## Proportion 0.669 0.331
## --------------------------------------------------------------------------------
## Visits
## n missing distinct
## 2211 0 5
##
## Value 0 1 2 3 3<
## Frequency 399 873 473 185 281
## Proportion 0.180 0.395 0.214 0.084 0.127
## --------------------------------------------------------------------------------
## Visits2
## n missing distinct
## 2211 0 4
##
## Value 0v 1v 2v 3v
## Frequency 399 873 473 466
## Proportion 0.180 0.395 0.214 0.211
## --------------------------------------------------------------------------------
## Health
## n missing distinct
## 2211 0 4
##
## --------------------------------------------------------------------------------
## Health2
## n missing distinct
## 2211 0 3
##
## Value 0_None 1_Mild 2_Significant
## Frequency 1375 714 122
## Proportion 0.622 0.323 0.055
## --------------------------------------------------------------------------------
## Health_Binary
## n missing distinct Info Sum Mean Gmd
## 2211 0 2 0.156 122 0.05518 0.1043
##
## --------------------------------------------------------------------------------
## Any_Health_Problem
## n missing distinct Info Sum Mean Gmd
## 2211 0 2 0.705 836 0.3781 0.4705
##
## --------------------------------------------------------------------------------
## Location_Europe
## n missing distinct
## 2211 0 2
##
## Value No Yes
## Frequency 1893 318
## Proportion 0.856 0.144
## --------------------------------------------------------------------------------
dplyr::count(ml, C_Diet)
## two way cross tabs (xtabs) and flatten the table
ftable(xtabs(~D_Sex + D_Neuter, data = ml))
## D_Neuter Intact Neutered
## D_Sex
## Female 192 850
## Male 294 875
index <- sample(nrow(ml), nrow(ml) * 0.75)
# set seed to make partition
set.seed(123)
ml_train = ml[index, ]
ml_test = ml[-index, ]
citation("stringr")
##
## To cite package 'stringr' in publications use:
##
## Wickham H (2022). _stringr: Simple, Consistent Wrappers for Common
## String Operations_. R package version 1.5.0,
## <https://CRAN.R-project.org/package=stringr>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {stringr: Simple, Consistent Wrappers for Common String Operations},
## author = {Hadley Wickham},
## year = {2022},
## note = {R package version 1.5.0},
## url = {https://CRAN.R-project.org/package=stringr},
## }
packageVersion("stringr")
## [1] '1.5.0'
citation("dplyr")
##
## To cite package 'dplyr' in publications use:
##
## Wickham H, François R, Henry L, Müller K, Vaughan D (2023). _dplyr: A
## Grammar of Data Manipulation_. R package version 1.1.3,
## <https://CRAN.R-project.org/package=dplyr>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {dplyr: A Grammar of Data Manipulation},
## author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan},
## year = {2023},
## note = {R package version 1.1.3},
## url = {https://CRAN.R-project.org/package=dplyr},
## }
packageVersion("dplyr")
## [1] '1.1.3'
citation("DescTools")
##
## To cite package 'DescTools' in publications use:
##
## Signorell A (2023). _DescTools: Tools for Descriptive Statistics_. R
## package version 0.99.50,
## <https://CRAN.R-project.org/package=DescTools>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {DescTools: Tools for Descriptive Statistics},
## author = {Andri Signorell},
## year = {2023},
## note = {R package version 0.99.50},
## url = {https://CRAN.R-project.org/package=DescTools},
## }
packageVersion("DescTools")
## [1] '0.99.50'
citation("car")
##
## To cite the car package in publications use:
##
## Fox J, Weisberg S (2019). _An R Companion to Applied Regression_,
## Third edition. Sage, Thousand Oaks CA.
## <https://socialsciences.mcmaster.ca/jfox/Books/Companion/>.
##
## A BibTeX entry for LaTeX users is
##
## @Book{,
## title = {An {R} Companion to Applied Regression},
## edition = {Third},
## author = {John Fox and Sanford Weisberg},
## year = {2019},
## publisher = {Sage},
## address = {Thousand Oaks {CA}},
## url = {https://socialsciences.mcmaster.ca/jfox/Books/Companion/},
## }
packageVersion("car")
## [1] '3.1.2'
citation("foreign")
##
## To cite package 'foreign' in publications use:
##
## R Core Team (2023). _foreign: Read Data Stored by 'Minitab', 'S',
## 'SAS', 'SPSS', 'Stata', 'Systat', 'Weka', 'dBase', ..._. R package
## version 0.8-85, <https://CRAN.R-project.org/package=foreign>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {foreign: Read Data Stored by 'Minitab', 'S', 'SAS', 'SPSS', 'Stata',
## 'Systat', 'Weka', 'dBase', ...},
## author = {{R Core Team}},
## year = {2023},
## note = {R package version 0.8-85},
## url = {https://CRAN.R-project.org/package=foreign},
## }
packageVersion("foreign")
## [1] '0.8.85'
citation("ggplot2")
##
## To cite ggplot2 in publications, please use
##
## H. Wickham. ggplot2: Elegant Graphics for Data Analysis.
## Springer-Verlag New York, 2016.
##
## A BibTeX entry for LaTeX users is
##
## @Book{,
## author = {Hadley Wickham},
## title = {ggplot2: Elegant Graphics for Data Analysis},
## publisher = {Springer-Verlag New York},
## year = {2016},
## isbn = {978-3-319-24277-4},
## url = {https://ggplot2.tidyverse.org},
## }
packageVersion("ggplot2")
## [1] '3.4.3'
citation("Hmisc")
##
## To cite package 'Hmisc' in publications use:
##
## Harrell Jr F (2023). _Hmisc: Harrell Miscellaneous_. R package
## version 5.1-1, <https://CRAN.R-project.org/package=Hmisc>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {Hmisc: Harrell Miscellaneous},
## author = {Frank E {Harrell Jr}},
## year = {2023},
## note = {R package version 5.1-1},
## url = {https://CRAN.R-project.org/package=Hmisc},
## }
packageVersion("Hmisc")
## [1] '5.1.1'
citation("reshape2")
##
## To cite reshape2 in publications use:
##
## Hadley Wickham (2007). Reshaping Data with the reshape Package.
## Journal of Statistical Software, 21(12), 1-20. URL
## http://www.jstatsoft.org/v21/i12/.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {Reshaping Data with the {reshape} Package},
## author = {Hadley Wickham},
## journal = {Journal of Statistical Software},
## year = {2007},
## volume = {21},
## number = {12},
## pages = {1--20},
## url = {http://www.jstatsoft.org/v21/i12/},
## }
packageVersion("reshape2")
## [1] '1.4.4'
citation("tidyverse")
##
## To cite package 'tidyverse' in publications use:
##
## Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R,
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller
## E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V,
## Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to
## the tidyverse." _Journal of Open Source Software_, *4*(43), 1686.
## doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {Welcome to the {tidyverse}},
## author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
## year = {2019},
## journal = {Journal of Open Source Software},
## volume = {4},
## number = {43},
## pages = {1686},
## doi = {10.21105/joss.01686},
## }
packageVersion("tidyverse")
## [1] '2.0.0'
citation("broom")
##
## To cite package 'broom' in publications use:
##
## Robinson D, Hayes A, Couch S (2023). _broom: Convert Statistical
## Objects into Tidy Tibbles_. R package version 1.0.5,
## <https://CRAN.R-project.org/package=broom>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {broom: Convert Statistical Objects into Tidy Tibbles},
## author = {David Robinson and Alex Hayes and Simon Couch},
## year = {2023},
## note = {R package version 1.0.5},
## url = {https://CRAN.R-project.org/package=broom},
## }
packageVersion("broom")
## [1] '1.0.5'
citation("aod")
##
## To cite package 'aod' in publications use:
##
## Lesnoff, M., Lancelot, R. (2012). aod: Analysis of Overdispersed
## Data. R package version 1.3.2, URL
## http://cran.r-project.org/package=aod
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {aod: Analysis of Overdispersed Data},
## author = {{Lesnoff} and {M.} and {Lancelot} and {R.}},
## year = {2012},
## note = {R package version 1.3.2},
## url = {https://cran.r-project.org/package=aod},
## }
packageVersion("aod")
## [1] '1.3.2'
citation("glmtoolbox")
##
## To cite package 'glmtoolbox' in publications use:
##
## Vanegas L, Rondón L, Paula G (2023). _glmtoolbox: Set of Tools to
## Data Analysis using Generalized Linear Models_. R package version
## 0.1.9, <https://CRAN.R-project.org/package=glmtoolbox>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {glmtoolbox: Set of Tools to Data Analysis using Generalized Linear Models},
## author = {Luis Hernando Vanegas and Luz Marina Rondón and Gilberto A. Paula},
## year = {2023},
## note = {R package version 0.1.9},
## url = {https://CRAN.R-project.org/package=glmtoolbox},
## }
packageVersion("glmtoolbox")
## [1] '0.1.9'
citation("fmsb")
##
## To cite package 'fmsb' in publications use:
##
## Nakazawa M (2023). _fmsb: Functions for Medical Statistics Book with
## some Demographic Data_. R package version 0.7.5,
## <https://CRAN.R-project.org/package=fmsb>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {fmsb: Functions for Medical Statistics Book with some Demographic Data},
## author = {Minato Nakazawa},
## year = {2023},
## note = {R package version 0.7.5},
## url = {https://CRAN.R-project.org/package=fmsb},
## }
##
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
packageVersion("fmsb")
## [1] '0.7.5'
citation("uwot")
##
## To cite package 'uwot' in publications use:
##
## Melville J (2023). _uwot: The Uniform Manifold Approximation and
## Projection (UMAP) Method for Dimensionality Reduction_. R package
## version 0.1.16, <https://CRAN.R-project.org/package=uwot>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {uwot: The Uniform Manifold Approximation and Projection (UMAP) Method
## for Dimensionality Reduction},
## author = {James Melville},
## year = {2023},
## note = {R package version 0.1.16},
## url = {https://CRAN.R-project.org/package=uwot},
## }
packageVersion("uwot")
## [1] '0.1.16'
citation("ggthemes")
##
## To cite package 'ggthemes' in publications use:
##
## Arnold J (2021). _ggthemes: Extra Themes, Scales and Geoms for
## 'ggplot2'_. R package version 4.2.4,
## <https://CRAN.R-project.org/package=ggthemes>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {ggthemes: Extra Themes, Scales and Geoms for 'ggplot2'},
## author = {Jeffrey B. Arnold},
## year = {2021},
## note = {R package version 4.2.4},
## url = {https://CRAN.R-project.org/package=ggthemes},
## }
packageVersion("ggthemes")
## [1] '4.2.4'
citation("ggforce")
##
## To cite package 'ggforce' in publications use:
##
## Pedersen T (2022). _ggforce: Accelerating 'ggplot2'_. R package
## version 0.4.1, <https://CRAN.R-project.org/package=ggforce>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {ggforce: Accelerating 'ggplot2'},
## author = {Thomas Lin Pedersen},
## year = {2022},
## note = {R package version 0.4.1},
## url = {https://CRAN.R-project.org/package=ggforce},
## }
packageVersion("ggforce")
## [1] '0.4.1'
citation("readxl")
##
## To cite package 'readxl' in publications use:
##
## Wickham H, Bryan J (2023). _readxl: Read Excel Files_. R package
## version 1.4.3, <https://CRAN.R-project.org/package=readxl>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {readxl: Read Excel Files},
## author = {Hadley Wickham and Jennifer Bryan},
## year = {2023},
## note = {R package version 1.4.3},
## url = {https://CRAN.R-project.org/package=readxl},
## }
packageVersion("readxl")
## [1] '1.4.3'
citation("ROCR")
##
## To cite ROCR in publications use:
##
## Sing T, Sander O, Beerenwinkel N, Lengauer T (2005). "ROCR:
## visualizing classifier performance in R." _Bioinformatics_, *21*(20),
## 7881. <http://rocr.bioinf.mpi-sb.mpg.de>.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## entry = {article},
## title = {ROCR: visualizing classifier performance in R},
## author = {T. Sing and O. Sander and N. Beerenwinkel and T. Lengauer},
## year = {2005},
## journal = {Bioinformatics},
## volume = {21},
## number = {20},
## pages = {7881},
## url = {http://rocr.bioinf.mpi-sb.mpg.de},
## }
##
## We have invested a lot of time and effort in creating ROCR, please cite
## it when using it for data analysis.
packageVersion("ROCR")
## [1] '1.0.11'
citation("pROC")
##
## If you use pROC in published research, please cite the following paper:
##
## Xavier Robin, Natacha Turck, Alexandre Hainard, Natalia Tiberti,
## Frédérique Lisacek, Jean-Charles Sanchez and Markus Müller (2011).
## pROC: an open-source package for R and S+ to analyze and compare ROC
## curves. BMC Bioinformatics, 12, p. 77. DOI: 10.1186/1471-2105-12-77
## <http://www.biomedcentral.com/1471-2105/12/77/>
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {pROC: an open-source package for R and S+ to analyze and compare ROC curves},
## author = {Xavier Robin and Natacha Turck and Alexandre Hainard and Natalia Tiberti and Frédérique Lisacek and Jean-Charles Sanchez and Markus Müller},
## year = {2011},
## journal = {BMC Bioinformatics},
## volume = {12},
## pages = {77},
## }
packageVersion("pROC")
## [1] '1.18.4'
citation("caret")
##
## To cite caret in publications use:
##
## Kuhn, M. (2008). Building Predictive Models in R Using the caret
## Package. Journal of Statistical Software, 28(5), 1–26.
## https://doi.org/10.18637/jss.v028.i05
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## title = {Building Predictive Models in R Using the caret Package},
## volume = {28},
## url = {https://www.jstatsoft.org/index.php/jss/article/view/v028i05},
## doi = {10.18637/jss.v028.i05},
## number = {5},
## journal = {Journal of Statistical Software},
## author = {{Kuhn} and {Max}},
## year = {2008},
## pages = {1–26},
## }
packageVersion("caret")
## [1] '6.0.94'
citation("xgboost")
##
## To cite package 'xgboost' in publications use:
##
## Chen T, He T, Benesty M, Khotilovich V, Tang Y, Cho H, Chen K,
## Mitchell R, Cano I, Zhou T, Li M, Xie J, Lin M, Geng Y, Li Y, Yuan J
## (2023). _xgboost: Extreme Gradient Boosting_. R package version
## 1.7.5.1, <https://CRAN.R-project.org/package=xgboost>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {xgboost: Extreme Gradient Boosting},
## author = {Tianqi Chen and Tong He and Michael Benesty and Vadim Khotilovich and Yuan Tang and Hyunsu Cho and Kailong Chen and Rory Mitchell and Ignacio Cano and Tianyi Zhou and Mu Li and Junyuan Xie and Min Lin and Yifeng Geng and Yutian Li and Jiaming Yuan},
## year = {2023},
## note = {R package version 1.7.5.1},
## url = {https://CRAN.R-project.org/package=xgboost},
## }
packageVersion("xgboost")
## [1] '1.7.5.1'
citation("smotefamily")
##
## To cite package 'smotefamily' in publications use:
##
## Siriseriwan W (2019). _smotefamily: A Collection of Oversampling
## Techniques for Class Imbalance Problem Based on SMOTE_. R package
## version 1.3.1, <https://CRAN.R-project.org/package=smotefamily>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {smotefamily: A Collection of Oversampling Techniques for Class Imbalance
## Problem Based on SMOTE},
## author = {Wacharasak Siriseriwan},
## year = {2019},
## note = {R package version 1.3.1},
## url = {https://CRAN.R-project.org/package=smotefamily},
## }
packageVersion("smotefamily")
## [1] '1.3.1'
citation("rsample")
##
## To cite package 'rsample' in publications use:
##
## Frick H, Chow F, Kuhn M, Mahoney M, Silge J, Wickham H (2023).
## _rsample: General Resampling Infrastructure_. R package version
## 1.2.0, <https://CRAN.R-project.org/package=rsample>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {rsample: General Resampling Infrastructure},
## author = {Hannah Frick and Fanny Chow and Max Kuhn and Michael Mahoney and Julia Silge and Hadley Wickham},
## year = {2023},
## note = {R package version 1.2.0},
## url = {https://CRAN.R-project.org/package=rsample},
## }
packageVersion("rsample")
## [1] '1.2.0'
citation("mltools")
##
## To cite package 'mltools' in publications use:
##
## Gorman B (2018). _mltools: Machine Learning Tools_. R package version
## 0.3.5, <https://CRAN.R-project.org/package=mltools>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {mltools: Machine Learning Tools},
## author = {Ben Gorman},
## year = {2018},
## note = {R package version 0.3.5},
## url = {https://CRAN.R-project.org/package=mltools},
## }
##
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
packageVersion("mltools")
## [1] '0.3.5'
citation("data.table")
##
## To cite package 'data.table' in publications use:
##
## Dowle M, Srinivasan A (2023). _data.table: Extension of
## `data.frame`_. R package version 1.14.8,
## <https://CRAN.R-project.org/package=data.table>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {data.table: Extension of `data.frame`},
## author = {Matt Dowle and Arun Srinivasan},
## year = {2023},
## note = {R package version 1.14.8},
## url = {https://CRAN.R-project.org/package=data.table},
## }
packageVersion("data.table")
## [1] '1.14.8'
citation("heatmaply")
##
## The methods within the package can be cited as:
##
## Galili, Tal, O'Callaghan, Alan, Sidi, Jonathan, Sievert, Carson
## (2017). "heatmaply: an R package for creating interactive cluster
## heatmaps for online publishing." _Bioinformatics_.
## doi:10.1093/bioinformatics/btx657
## <https://doi.org/10.1093/bioinformatics/btx657>,
## <https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btx657/21358327/btx657.pdf>.
##
## A BibTeX entry for LaTeX users is
##
## @Article{,
## author = {{Galili} and {Tal} and {O'Callaghan} and {Alan} and {Sidi} and {Jonathan} and {Sievert} and {Carson}},
## title = {heatmaply: an R package for creating interactive cluster heatmaps for online publishing},
## journal = {Bioinformatics},
## year = {2017},
## doi = {10.1093/bioinformatics/btx657},
## url = {https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btx657/21358327/btx657.pdf},
## }
##
## This free open-source software implements academic research by the
## authors and co-workers. If you use it, please support the project by
## citing the appropriate journal articles.
packageVersion("heatmaply")
## [1] '1.5.0'
citation("rstatix")
##
## To cite package 'rstatix' in publications use:
##
## Kassambara A (2023). _rstatix: Pipe-Friendly Framework for Basic
## Statistical Tests_. R package version 0.7.2,
## <https://CRAN.R-project.org/package=rstatix>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {rstatix: Pipe-Friendly Framework for Basic Statistical Tests},
## author = {Alboukadel Kassambara},
## year = {2023},
## note = {R package version 0.7.2},
## url = {https://CRAN.R-project.org/package=rstatix},
## }
packageVersion("rstatix")
## [1] '0.7.2'
citation("dbscan")
##
## Hahsler M, Piekenbrock M (2022). _dbscan: Density-Based Spatial
## Clustering of Applications with Noise (DBSCAN) and Related Algorithms_.
## R package version 1.1-11, <https://CRAN.R-project.org/package=dbscan>.
##
## To cite dbscan in publications use:
##
## Hahsler M, Piekenbrock M, Doran D (2019). "dbscan: Fast Density-Based
## Clustering with R." _Journal of Statistical Software_, *91*(1), 1-30.
## doi:10.18637/jss.v091.i01 <https://doi.org/10.18637/jss.v091.i01>.
##
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
packageVersion("dbscan")
## [1] '1.1.11'
citation("corrplot")
##
## To cite corrplot in publications use:
##
## Taiyun Wei and Viliam Simko (2021). R package 'corrplot':
## Visualization of a Correlation Matrix (Version 0.92). Available from
## https://github.com/taiyun/corrplot
##
## A BibTeX entry for LaTeX users is
##
## @Manual{corrplot2021,
## title = {R package 'corrplot': Visualization of a Correlation Matrix},
## author = {Taiyun Wei and Viliam Simko},
## year = {2021},
## note = {(Version 0.92)},
## url = {https://github.com/taiyun/corrplot},
## }
packageVersion("corrplot")
## [1] '0.92'
citation("showtext")
##
## To cite package 'showtext' in publications use:
##
## Qiu Y, details. aotisSfAf (2023). _showtext: Using Fonts More Easily
## in R Graphs_. R package version 0.9-6,
## <https://CRAN.R-project.org/package=showtext>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {showtext: Using Fonts More Easily in R Graphs},
## author = {Yixuan Qiu and authors/contributors of the included software. See file AUTHORS for details.},
## year = {2023},
## note = {R package version 0.9-6},
## url = {https://CRAN.R-project.org/package=showtext},
## }
##
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
packageVersion("showtext")
## [1] '0.9.6'
citation("cluster")
##
## To cite the R package 'cluster' in publications use:
##
## Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M., Hornik,
## K.(2022). cluster: Cluster Analysis Basics and Extensions. R package
## version 2.1.4.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {cluster: Cluster Analysis Basics and Extensions},
## author = {Martin Maechler and Peter Rousseeuw and Anja Struyf and Mia Hubert and Kurt Hornik},
## year = {2022},
## url = {https://CRAN.R-project.org/package=cluster},
## note = {R package version 2.1.4 --- For new features, see the 'Changelog' file (in the package source)},
## }
packageVersion("cluster")
## [1] '2.1.4'
citation("factoextra")
##
## To cite package 'factoextra' in publications use:
##
## Kassambara A, Mundt F (2020). _factoextra: Extract and Visualize the
## Results of Multivariate Data Analyses_. R package version 1.0.7,
## <https://CRAN.R-project.org/package=factoextra>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {factoextra: Extract and Visualize the Results of Multivariate Data Analyses},
## author = {Alboukadel Kassambara and Fabian Mundt},
## year = {2020},
## note = {R package version 1.0.7},
## url = {https://CRAN.R-project.org/package=factoextra},
## }
packageVersion("factoextra")
## [1] '1.0.7'
citation("clusterSim")
##
## To cite clusterSim in publications use:
##
## Walesiak M, Dudek A (2020). "The Choice of Variable Normalization
## Method in Cluster Analysis." In Soliman KS (ed.), _Education
## Excellence and Innovation Management: A 2025 Vision to Sustain
## Economic Development During Global Challenges_, 325-340. ISBN
## 978-0-9998551-4-1.
##
## A BibTeX entry for LaTeX users is
##
## @InProceedings{,
## title = {The Choice of Variable Normalization Method in Cluster Analysis},
## author = {Marek Walesiak and Andrzej Dudek},
## booktitle = {Education Excellence and Innovation Management: A 2025 Vision to Sustain Economic Development During Global Challenges},
## booksubtitle = {Proceedings of the 35th International Business Information Management Association Conference (IBIMA), 1-2 April 2020, Seville, Spain},
## year = {2020},
## publisher = {International Business Information Management Association (IBIMA)},
## editor = {Khalid S. Soliman},
## pages = {325-340},
## isbn = {978-0-9998551-4-1},
## }
packageVersion("clusterSim")
## [1] '0.51.3'
citation("RColorBrewer")
##
## To cite package 'RColorBrewer' in publications use:
##
## Neuwirth E (2022). _RColorBrewer: ColorBrewer Palettes_. R package
## version 1.1-3, <https://CRAN.R-project.org/package=RColorBrewer>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {RColorBrewer: ColorBrewer Palettes},
## author = {Erich Neuwirth},
## year = {2022},
## note = {R package version 1.1-3},
## url = {https://CRAN.R-project.org/package=RColorBrewer},
## }
packageVersion("RColorBrewer")
## [1] '1.1.3'
citation("IHW")
##
## To cite package 'IHW' in publications use:
##
## Nikolaos Ignatiadis, Bernd Klaus, Judith Zaugg and Wolfgang Huber
## (2016): Data-driven hypothesis weighting increases detection power in
## genome-scale multiple testing. Nature Methods 13:577, doi:
## 10.1038/nmeth.3885
##
## Nikolaos Ignatiadis and Wolfgang Huber (2016): Covariate-powered
## weighted multiple testing with false discovery rate control.
## arXiv:1701.05179
##
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
packageVersion("IHW")
## [1] '1.26.0'
if (SAVE) {
save(ml, file = "ml_read_data_101.RData")
}